Classifying Users

Classifying Users

import csv
import sqlite3
import pandas as pd

# Load the raw EV charging dataset, drop every row containing a missing
# value, and persist the cleaned copy used by the database-loading steps.
df = pd.read_csv('/Users/saisatvikhlakkimsetty/Downloads/ev_charging_patterns.csv')
df.dropna(inplace=True)
# index=False keeps the pandas row index out of the file, so the cleaned
# CSV has exactly the original columns (no spurious 'Unnamed: 0' column).
df.to_csv('/Users/saisatvikhlakkimsetty/Downloads/ev_charging_patterns_null_removed.csv', index=False)
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[2], line 2
      1 import pandas as pd
----> 2 df=pd.read_csv('/Users/saisatvikhlakkimsetty/Downloads/ev_charging_patterns.csv')
      3 df.dropna(inplace=True)
      4 df.to_csv('/Users/saisatvikhlakkimsetty/Downloads/ev_charging_patterns_null_removed.csv')

File /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
   1013 kwds_defaults = _refine_defaults_read(
   1014     dialect,
   1015     delimiter,
   (...)
   1022     dtype_backend=dtype_backend,
   1023 )
   1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)

File /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
    617 _validate_names(kwds.get("names", None))
    619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
    622 if chunksize or iterator:
    623     return parser

File /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
   1617     self.options["has_index_names"] = kwds["has_index_names"]
   1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)

File /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
   1878     if "b" not in mode:
   1879         mode += "b"
-> 1880 self.handles = get_handle(
   1881     f,
   1882     mode,
   1883     encoding=self.options.get("encoding", None),
   1884     compression=self.options.get("compression", None),
   1885     memory_map=self.options.get("memory_map", False),
   1886     is_text=is_text,
   1887     errors=self.options.get("encoding_errors", "strict"),
   1888     storage_options=self.options.get("storage_options", None),
   1889 )
   1890 assert self.handles is not None
   1891 f = self.handles.handle

File /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/io/common.py:873, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    868 elif isinstance(handle, str):
    869     # Check whether the filename is to be opened in binary mode.
    870     # Binary mode does not support 'encoding' and 'newline'.
    871     if ioargs.encoding and "b" not in ioargs.mode:
    872         # Encoding
--> 873         handle = open(
    874             handle,
    875             ioargs.mode,
    876             encoding=ioargs.encoding,
    877             errors=errors,
    878             newline="",
    879         )
    880     else:
    881         # Binary mode
    882         handle = open(handle, ioargs.mode)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/saisatvikhlakkimsetty/Downloads/ev_charging_patterns.csv'
# Path to the cleaned CSV produced by the dropna() step above.
data_path = '/Users/saisatvikhlakkimsetty/Downloads/ev_charging_patterns_null_removed.csv'
# Use the same 'ev_charging.db' filename as the rest of the notebook —
# the original 'ev_chargingdb' (missing '.db') silently created a second,
# separate database file that none of the later cells ever read.
conn = sqlite3.connect('ev_charging.db')
ev_cursor = conn.cursor()

Step-1 Creating Normalized Database

# Connect to SQLite database
conn = sqlite3.connect('ev_charging.db')
ev_cursor = conn.cursor()

# SQLite ignores FOREIGN KEY constraints unless they are explicitly
# enabled on each connection; without this the FKs below are decorative.
ev_cursor.execute("PRAGMA foreign_keys = ON;")

# Drop existing tables (if any). Child tables are dropped before the
# parent tables they reference so the drops also succeed when
# foreign-key enforcement is on.
drop_queries = [
    "DROP TABLE IF EXISTS EnvironmentalData;",
    "DROP TABLE IF EXISTS ChargingSessions;",
    "DROP TABLE IF EXISTS Users;",
    "DROP TABLE IF EXISTS ChargingStations;",
]

for query in drop_queries:
    ev_cursor.execute(query)
conn.commit()

# Users: one row per user, keyed by the dataset's 'User ID' string.
create_users_table = """
CREATE TABLE Users (
    user_id VARCHAR(50) PRIMARY KEY,
    user_type VARCHAR(50),
    vehicle_model VARCHAR(50),
    vehicle_age_years FLOAT
);
"""
ev_cursor.execute(create_users_table)
conn.commit()
print("Users table created successfully.")

# ChargingStations: one row per station, keyed by 'Charging Station ID'.
create_station_table = """
CREATE TABLE ChargingStations (
    station_id VARCHAR(50) PRIMARY KEY,
    station_location VARCHAR(100),
    charger_type VARCHAR(50)
);
"""
ev_cursor.execute(create_station_table)
conn.commit()
print("ChargingStations table created successfully.")

# ChargingSessions: fact table — each charging session references one
# user and one station; session_id is auto-assigned.
create_sessions_table = """
CREATE TABLE ChargingSessions (
    session_id INTEGER PRIMARY KEY AUTOINCREMENT,
    user_id VARCHAR(50),
    station_id VARCHAR(50),
    start_time TIMESTAMP,
    end_time TIMESTAMP,
    duration_hours FLOAT,
    energy_consumed_kwh FLOAT,
    charging_cost_usd FLOAT,
    charging_rate_kw FLOAT,
    soc_start_percent FLOAT,
    soc_end_percent FLOAT,
    time_of_day VARCHAR(50),
    day_of_week VARCHAR(50),
    FOREIGN KEY(user_id) REFERENCES Users(user_id),
    FOREIGN KEY(station_id) REFERENCES ChargingStations(station_id)
);
"""
ev_cursor.execute(create_sessions_table)
conn.commit()
print("ChargingSessions table created successfully.")

# EnvironmentalData: 1:1 extension of ChargingSessions (shared PK).
create_env_data_table = """
CREATE TABLE EnvironmentalData (
    session_id INTEGER,
    distance_driven_km FLOAT,
    temperature_c FLOAT,
    battery_capacity_kwh FLOAT,
    PRIMARY KEY(session_id),
    FOREIGN KEY(session_id) REFERENCES ChargingSessions(session_id)
);
"""
ev_cursor.execute(create_env_data_table)
conn.commit()
print("EnvironmentalData table created successfully.")

# Close connection
conn.close()
Users table created successfully.
ChargingStations table created successfully.
ChargingSessions table created successfully.
EnvironmentalData table created successfully.
import csv
import sqlite3

# Connect to SQLite database and turn on enforcement of the FOREIGN KEY
# constraints declared in the schema (SQLite leaves them off by default).
conn = sqlite3.connect('ev_charging.db')
ev_cursor = conn.cursor()
ev_cursor.execute("PRAGMA foreign_keys = ON;")

# Cleaned CSV produced by the earlier dropna() step.
data_path = '/Users/saisatvikhlakkimsetty/Downloads/ev_charging_patterns_null_removed.csv'

# Users and stations recur across many sessions, so INSERT OR IGNORE
# silently skips rows whose primary key already exists.
insert_user_query = """
INSERT OR IGNORE INTO Users (user_id, user_type, vehicle_model, vehicle_age_years)
VALUES (?, ?, ?, ?);
"""

insert_station_query = """
INSERT OR IGNORE INTO ChargingStations (station_id, station_location, charger_type)
VALUES (?, ?, ?);
"""

insert_sessions_query = """
INSERT INTO ChargingSessions (user_id, station_id, start_time, end_time, duration_hours,
    energy_consumed_kwh, charging_cost_usd, charging_rate_kw,
    soc_start_percent, soc_end_percent, time_of_day, day_of_week)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
"""

insert_env_data_query = """
INSERT INTO EnvironmentalData (session_id, distance_driven_km, temperature_c, battery_capacity_kwh)
VALUES (?, ?, ?, ?);
"""

# newline='' is required when handing a file to the csv module so quoted
# fields containing newlines parse correctly; the encoding is pinned so
# the load does not depend on the platform default (the header contains
# 'Temperature (°C)', a non-ASCII character).
with open(data_path, 'r', newline='', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        try:
            # Insert user data (ignored if this user already exists)
            ev_cursor.execute(insert_user_query, (
                row['User ID'],
                row['User Type'],
                row['Vehicle Model'],
                float(row['Vehicle Age (years)'])
            ))

            # Insert station data (ignored if this station already exists)
            ev_cursor.execute(insert_station_query, (
                row['Charging Station ID'],
                row['Charging Station Location'],
                row['Charger Type']
            ))

            # Insert the session itself; session_id is auto-assigned
            ev_cursor.execute(insert_sessions_query, (
                row['User ID'],
                row['Charging Station ID'],
                row['Charging Start Time'],
                row['Charging End Time'],
                float(row['Charging Duration (hours)']),
                float(row['Energy Consumed (kWh)']),
                float(row['Charging Cost (USD)']),
                float(row['Charging Rate (kW)']),
                float(row['State of Charge (Start %)']),
                float(row['State of Charge (End %)']),
                row['Time of Day'],
                row['Day of Week']
            ))

            # AUTOINCREMENT key of the session row just inserted
            session_id = ev_cursor.lastrowid

            # Environmental data is 1:1 with sessions via session_id
            ev_cursor.execute(insert_env_data_query, (
                session_id,
                float(row['Distance Driven (since last charge) (km)']),
                float(row['Temperature (°C)']),
                float(row['Battery Capacity (kWh)'])
            ))

            # Commit per CSV row so each row is all-or-nothing: a later
            # failure rolls back only the current row's partial inserts.
            conn.commit()

        except sqlite3.IntegrityError as e:
            print(f"Integrity error for User ID {row['User ID']}: {e}")
            conn.rollback()
        except Exception as e:
            print(f"Error for User ID {row['User ID']}: {e}")
            conn.rollback()

# Close connection
conn.close()
import pandas as pd
import sqlite3

# Connect to SQLite database
conn = sqlite3.connect('ev_charging.db')

# Re-assemble one denormalized row per charging session by joining the
# four normalized tables back together.
select_data_from_tables = """
SELECT 
    u.user_id,
    u.user_type,
    u.vehicle_model,
    u.vehicle_age_years,
    cs.station_id,
    cs.station_location,
    cs.charger_type,
    s.start_time,
    s.end_time,
    s.duration_hours,
    s.energy_consumed_kwh,
    s.charging_cost_usd,
    s.charging_rate_kw,
    s.soc_start_percent,
    s.soc_end_percent,
    s.time_of_day,
    s.day_of_week,
    e.distance_driven_km,
    e.temperature_c,
    e.battery_capacity_kwh
FROM Users u
JOIN ChargingSessions s ON u.user_id = s.user_id
JOIN ChargingStations cs ON s.station_id = cs.station_id
JOIN EnvironmentalData e ON s.session_id = e.session_id;
"""

# try/finally guarantees the connection is closed even if the query
# raises (the original leaked the connection on failure).
try:
    df = pd.read_sql_query(select_data_from_tables, conn)
finally:
    conn.close()

# Display the DataFrame
print(df)
        user_id               user_type vehicle_model  vehicle_age_years  \
0        User_1                Commuter        BMW i3                2.0   
1        User_2           Casual Driver  Hyundai Kona                3.0   
2        User_3                Commuter    Chevy Bolt                2.0   
3        User_4  Long-Distance Traveler  Hyundai Kona                1.0   
4        User_5  Long-Distance Traveler  Hyundai Kona                1.0   
...         ...                     ...           ...                ...   
1126  User_1316                Commuter   Nissan Leaf                7.0   
1127  User_1317           Casual Driver        BMW i3                4.0   
1128  User_1318                Commuter   Nissan Leaf                5.0   
1129  User_1319                Commuter    Chevy Bolt                5.0   
1130  User_1320                Commuter   Nissan Leaf                5.0   

       station_id station_location     charger_type           start_time  \
0     Station_391          Houston  DC Fast Charger  2024-01-01 00:00:00   
1     Station_428    San Francisco          Level 1  2024-01-01 01:00:00   
2     Station_181    San Francisco          Level 2  2024-01-01 02:00:00   
3     Station_327          Houston          Level 1  2024-01-01 03:00:00   
4     Station_108      Los Angeles          Level 1  2024-01-01 04:00:00   
...           ...              ...              ...                  ...   
1126   Station_57         New York          Level 2  2024-02-24 19:00:00   
1127   Station_40         New York          Level 1  2024-02-24 20:00:00   
1128  Station_374         New York  DC Fast Charger  2024-02-24 21:00:00   
1129  Station_336          Chicago  DC Fast Charger  2024-02-24 22:00:00   
1130  Station_128    San Francisco          Level 1  2024-02-24 23:00:00   

                 end_time  duration_hours  energy_consumed_kwh  \
0     2024-01-01 00:39:00        0.591363            60.712346   
1     2024-01-01 03:01:00        3.133652            12.339275   
2     2024-01-01 04:48:00        2.452653            19.128876   
3     2024-01-01 06:42:00        1.266431            79.457824   
4     2024-01-01 05:46:00        2.019765            19.629104   
...                   ...             ...                  ...   
1126  2024-02-24 20:30:00        1.426444            42.011654   
1127  2024-02-24 20:44:00        3.238212            68.185853   
1128  2024-02-24 23:03:00        3.267122            18.895102   
1129  2024-02-24 23:20:00        2.754527            13.756252   
1130  2024-02-24 23:56:00        3.740970            63.652570   

      charging_cost_usd  charging_rate_kw  soc_start_percent  soc_end_percent  \
0             13.087717         36.389181          29.371576        86.119962   
1             21.128448         30.677735          10.115778        84.664344   
2             35.667270         27.513593           6.854604        69.917615   
3             13.036239         32.882870          83.120003        99.624328   
4             10.161471         10.215712          54.258950        63.743786   
...                 ...               ...                ...              ...   
1126          22.081164          5.895475          39.204102        83.915952   
1127           5.067806         18.388012          31.456375        93.096461   
1128          37.255002         45.482066          71.903081        78.678879   
1129          39.046146         38.148183          76.187997        65.926573   
1130          10.863674         33.704226          59.338076        56.692439   

     time_of_day day_of_week  distance_driven_km  temperature_c  \
0        Evening     Tuesday          293.602111      27.947953   
1        Morning      Monday          112.112804      14.311026   
2        Morning    Thursday           71.799253      21.002002   
3        Evening    Saturday          199.577785      38.316313   
4        Morning    Saturday          203.661847      -7.834199   
...          ...         ...                 ...            ...   
1126     Evening      Sunday          239.601075       1.919655   
1127     Evening     Tuesday          164.376022      34.029775   
1128     Evening     Tuesday          226.519258      20.358761   
1129   Afternoon      Sunday          291.494076      24.134598   
1130     Evening      Monday           14.449236      -6.966593   

      battery_capacity_kwh  
0               108.463007  
1               100.000000  
2                75.000000  
3                50.000000  
4                50.000000  
...                    ...  
1126            100.000000  
1127            100.000000  
1128            100.000000  
1129             85.000000  
1130            120.447195  

[1131 rows x 20 columns]

Verifying the data retrieved from the database

# Preview the first five rows of the joined dataset to sanity-check the SQL join.
df.head()
user_id user_type vehicle_model vehicle_age_years station_id station_location charger_type start_time end_time duration_hours energy_consumed_kwh charging_cost_usd charging_rate_kw soc_start_percent soc_end_percent time_of_day day_of_week distance_driven_km temperature_c battery_capacity_kwh
0 User_1 Commuter BMW i3 2.0 Station_391 Houston DC Fast Charger 2024-01-01 00:00:00 2024-01-01 00:39:00 0.591363 60.712346 13.087717 36.389181 29.371576 86.119962 Evening Tuesday 293.602111 27.947953 108.463007
1 User_2 Casual Driver Hyundai Kona 3.0 Station_428 San Francisco Level 1 2024-01-01 01:00:00 2024-01-01 03:01:00 3.133652 12.339275 21.128448 30.677735 10.115778 84.664344 Morning Monday 112.112804 14.311026 100.000000
2 User_3 Commuter Chevy Bolt 2.0 Station_181 San Francisco Level 2 2024-01-01 02:00:00 2024-01-01 04:48:00 2.452653 19.128876 35.667270 27.513593 6.854604 69.917615 Morning Thursday 71.799253 21.002002 75.000000
3 User_4 Long-Distance Traveler Hyundai Kona 1.0 Station_327 Houston Level 1 2024-01-01 03:00:00 2024-01-01 06:42:00 1.266431 79.457824 13.036239 32.882870 83.120003 99.624328 Evening Saturday 199.577785 38.316313 50.000000
4 User_5 Long-Distance Traveler Hyundai Kona 1.0 Station_108 Los Angeles Level 1 2024-01-01 04:00:00 2024-01-01 05:46:00 2.019765 19.629104 10.161471 10.215712 54.258950 63.743786 Morning Saturday 203.661847 -7.834199 50.000000
# Drop any remaining rows with missing values, then confirm that no NaNs
# are left in any column (every count below should be 0).
df.dropna(inplace=True)
df.isna().sum()
user_id                 0
user_type               0
vehicle_model           0
vehicle_age_years       0
station_id              0
station_location        0
charger_type            0
start_time              0
end_time                0
duration_hours          0
energy_consumed_kwh     0
charging_cost_usd       0
charging_rate_kw        0
soc_start_percent       0
soc_end_percent         0
time_of_day             0
day_of_week             0
distance_driven_km      0
temperature_c           0
battery_capacity_kwh    0
dtype: int64
%pip install scikit-learn
%pip install seaborn
Requirement already satisfied: scikit-learn in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (1.6.0)
Requirement already satisfied: numpy>=1.19.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from scikit-learn) (2.0.2)
Requirement already satisfied: scipy>=1.6.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from scikit-learn) (1.13.1)
Requirement already satisfied: joblib>=1.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from scikit-learn) (1.4.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from scikit-learn) (3.5.0)
Note: you may need to restart the kernel to use updated packages.
Requirement already satisfied: seaborn in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (0.13.2)
Requirement already satisfied: numpy!=1.24.0,>=1.20 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from seaborn) (2.0.2)
Requirement already satisfied: pandas>=1.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from seaborn) (2.2.3)
Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from seaborn) (3.9.2)
Requirement already satisfied: contourpy>=1.0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.3.0)
Requirement already satisfied: cycler>=0.10 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.54.1)
Requirement already satisfied: kiwisolver>=1.3.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.7)
Requirement already satisfied: packaging>=20.0 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (24.1)
Requirement already satisfied: pillow>=8 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (10.4.0)
Requirement already satisfied: pyparsing>=2.3.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.1.4)
Requirement already satisfied: python-dateutil>=2.7 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pandas>=1.2->seaborn) (2024.2)
Requirement already satisfied: tzdata>=2022.7 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pandas>=1.2->seaborn) (2024.2)
Requirement already satisfied: six>=1.5 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# The classification target is the user category; every other column is
# a candidate feature.
y = df['user_type']
X = df.drop(columns=['user_type'])

# Plain (unstratified) 80/20 hold-out split; stratify is left at its
# default of None here — the stratified variant is compared next.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Stratifying the data/ target variable

# Stratified 80/20 split on the target so each class keeps the same
# share of rows in the train and test partitions as in the full dataset.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['user_type'], random_state=42)

# Class proportions before and after the split, for comparison.
user_type_distribution = df['user_type'].value_counts(normalize=True)
train_distribution = train_df['user_type'].value_counts(normalize=True)
test_distribution = test_df['user_type'].value_counts(normalize=True)

print("Original dataset 'User Type' distribution:")
print(user_type_distribution)

print("\nTrain set 'User Type' distribution:")
print(train_distribution)

print("\nTest set 'User Type' distribution:")
print(test_distribution)
Original dataset 'User Type' distribution:
user_type
Commuter                  0.357206
Long-Distance Traveler    0.336870
Casual Driver             0.305924
Name: proportion, dtype: float64

Train set 'User Type' distribution:
user_type
Commuter                  0.357301
Long-Distance Traveler    0.336283
Casual Driver             0.306416
Name: proportion, dtype: float64

Test set 'User Type' distribution:
user_type
Commuter                  0.356828
Long-Distance Traveler    0.339207
Casual Driver             0.303965
Name: proportion, dtype: float64
pip install ipywidgets
Requirement already satisfied: ipywidgets in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (8.1.5)
Requirement already satisfied: comm>=0.1.3 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipywidgets) (0.2.2)
Requirement already satisfied: ipython>=6.1.0 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipywidgets) (8.26.0)
Requirement already satisfied: traitlets>=4.3.1 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipywidgets) (5.14.3)
Requirement already satisfied: widgetsnbextension~=4.0.12 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from ipywidgets) (4.0.13)
Requirement already satisfied: jupyterlab-widgets~=3.0.12 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from ipywidgets) (3.0.13)
Requirement already satisfied: decorator in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (5.1.1)
Requirement already satisfied: jedi>=0.16 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (0.19.1)
Requirement already satisfied: matplotlib-inline in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (0.1.7)
Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.41 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (3.0.47)
Requirement already satisfied: pygments>=2.4.0 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (2.18.0)
Requirement already satisfied: stack-data in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (0.6.3)
Requirement already satisfied: pexpect>4.3 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (4.9.0)
Requirement already satisfied: parso<0.9.0,>=0.8.3 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets) (0.8.4)
Requirement already satisfied: ptyprocess>=0.5 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets) (0.7.0)
Requirement already satisfied: wcwidth in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from prompt-toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets) (0.2.13)
Requirement already satisfied: executing>=1.2.0 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (2.0.1)
Requirement already satisfied: asttokens>=2.1.0 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (2.4.1)
Requirement already satisfied: pure-eval in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (0.2.3)
Requirement already satisfied: six>=1.12.0 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from asttokens>=2.1.0->stack-data->ipython>=6.1.0->ipywidgets) (1.16.0)
Note: you may need to restart the kernel to use updated packages.

Exploratory Data Analysis

from ydata_profiling import ProfileReport

# Generate an automated exploratory profile of the dataset (distributions,
# correlations, missing values) and save it as a standalone HTML file.
profile = ProfileReport(df, title="Data Profile Report", explorative=True)
profile.to_file("ev_charging_patterns_profile_report.html")

# Also embed the report inline in the notebook for quick review.
profile.to_notebook_iframe()

“”” Observations:

  1. Missing Values The dataset contains 66 missing values, which accounts for approximately 0.3% of the data. Missing values are present in the following columns: ‘Energy Consumed (kWh)’,’Charging Rate (kW)’,’Distance Driven (since last charge) (km)’

  2. Duplicates A thorough check reveals that there are no duplicate rows in the dataset, indicating data uniqueness.

  3. Categorical Variables The dataset includes the following categorical variables: ‘User Type’: Represents the type of user, which is our target variable. ‘Vehicle Model’: Indicates the vehicle model being used. ‘Charger Type’: Specifies the type of charger employed during the session. ‘Charging Station Location’: Describes the location of the charging station. All these variables have low cardinality (each contains between 3 and 5 unique values), making them manageable for encoding or analysis.

  4. Numerical Variables Some inconsistencies and unusual patterns were identified: ‘State of Charge (Start %)’ and ‘State of Charge (End %)’: Both columns have values exceeding 100%, which is logically inconsistent and requires correction or capping.

‘Temperature (°C)’: The recorded temperatures range from -10.72°C to 73.17°C, suggesting potential outliers that need further investigation.

Strong correlations observed: ‘Energy Consumed (kWh)’ and ‘Charging Duration (hours)’ have a strong positive correlation (0.95). ‘Battery Capacity (kWh)’ and ‘Charging Rate (kW)’ show a moderate positive correlation (0.68).

  1. Date/Time Variables

The dataset includes two datetime variables: ‘Charging Start Time’,’Charging End Time’ These variables can be leveraged to derive new features like charging duration, time of day, or day of the week.

  1. Distributions

Some numerical features show distinct patterns: ‘Charging Duration (hours)’ and ‘Charging Cost (USD)’: Both exhibit right-skewed distributions, indicating a few sessions with unusually high values.

“””

# Re-inspect the first rows after profiling to confirm the data is unchanged.
df.head()
user_id user_type vehicle_model vehicle_age_years station_id station_location charger_type start_time end_time duration_hours energy_consumed_kwh charging_cost_usd charging_rate_kw soc_start_percent soc_end_percent time_of_day day_of_week distance_driven_km temperature_c battery_capacity_kwh
0 User_1 Commuter BMW i3 2.0 Station_391 Houston DC Fast Charger 2024-01-01 00:00:00 2024-01-01 00:39:00 0.591363 60.712346 13.087717 36.389181 29.371576 86.119962 Evening Tuesday 293.602111 27.947953 108.463007
1 User_2 Casual Driver Hyundai Kona 3.0 Station_428 San Francisco Level 1 2024-01-01 01:00:00 2024-01-01 03:01:00 3.133652 12.339275 21.128448 30.677735 10.115778 84.664344 Morning Monday 112.112804 14.311026 100.000000
2 User_3 Commuter Chevy Bolt 2.0 Station_181 San Francisco Level 2 2024-01-01 02:00:00 2024-01-01 04:48:00 2.452653 19.128876 35.667270 27.513593 6.854604 69.917615 Morning Thursday 71.799253 21.002002 75.000000
3 User_4 Long-Distance Traveler Hyundai Kona 1.0 Station_327 Houston Level 1 2024-01-01 03:00:00 2024-01-01 06:42:00 1.266431 79.457824 13.036239 32.882870 83.120003 99.624328 Evening Saturday 199.577785 38.316313 50.000000
4 User_5 Long-Distance Traveler Hyundai Kona 1.0 Station_108 Los Angeles Level 1 2024-01-01 04:00:00 2024-01-01 05:46:00 2.019765 19.629104 10.161471 10.215712 54.258950 63.743786 Morning Saturday 203.661847 -7.834199 50.000000
# Column dtypes and non-null counts. Note start_time/end_time come back
# from SQLite as plain strings (object dtype), not parsed datetimes.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1131 entries, 0 to 1130
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   user_id               1131 non-null   object 
 1   user_type             1131 non-null   object 
 2   vehicle_model         1131 non-null   object 
 3   vehicle_age_years     1131 non-null   float64
 4   station_id            1131 non-null   object 
 5   station_location      1131 non-null   object 
 6   charger_type          1131 non-null   object 
 7   start_time            1131 non-null   object 
 8   end_time              1131 non-null   object 
 9   duration_hours        1131 non-null   float64
 10  energy_consumed_kwh   1131 non-null   float64
 11  charging_cost_usd     1131 non-null   float64
 12  charging_rate_kw      1131 non-null   float64
 13  soc_start_percent     1131 non-null   float64
 14  soc_end_percent       1131 non-null   float64
 15  time_of_day           1131 non-null   object 
 16  day_of_week           1131 non-null   object 
 17  distance_driven_km    1131 non-null   float64
 18  temperature_c         1131 non-null   float64
 19  battery_capacity_kwh  1131 non-null   float64
dtypes: float64(10), object(10)
memory usage: 176.8+ KB
# Summary statistics for the numeric columns. soc_start_percent and
# soc_end_percent show maxima above 100%, which is physically inconsistent
# and flagged in the observations above.
df.describe()
vehicle_age_years duration_hours energy_consumed_kwh charging_cost_usd charging_rate_kw soc_start_percent soc_end_percent distance_driven_km temperature_c battery_capacity_kwh
count 1131.000000 1131.000000 1131.000000 1131.000000 1131.000000 1131.000000 1131.000000 1131.000000 1131.000000 1131.000000
mean 3.604227 2.303177 42.915668 22.488351 26.014166 49.230036 75.012917 153.663101 15.305780 74.427818
std 2.324090 1.065878 22.201286 10.792504 14.010292 24.170435 16.920463 85.549751 14.751266 20.828350
min 0.000000 0.095314 0.045772 0.307085 1.472549 2.325959 7.604224 1.899538 -10.724770 1.536540
25% 2.000000 1.425281 24.248936 13.133925 13.949809 27.661992 62.264460 80.954993 3.009498 62.000000
50% 4.000000 2.312675 42.865611 21.828088 25.838488 48.947886 75.100944 152.257515 14.641853 75.000000
75% 6.000000 3.145998 61.544055 31.675804 37.508677 69.783816 88.245070 225.469628 27.824244 85.000000
max 11.688592 7.635145 152.238758 69.407743 97.342255 125.087227 177.708666 398.364775 73.169588 193.003074
# Per-column dtypes (same dtype information as df.info(), without counts).
df.dtypes
user_id                  object
user_type                object
vehicle_model            object
vehicle_age_years       float64
station_id               object
station_location         object
charger_type             object
start_time               object
end_time                 object
duration_hours          float64
energy_consumed_kwh     float64
charging_cost_usd       float64
charging_rate_kw        float64
soc_start_percent       float64
soc_end_percent         float64
time_of_day              object
day_of_week              object
distance_driven_km      float64
temperature_c           float64
battery_capacity_kwh    float64
dtype: object
# Count fully duplicated rows (0 here, so no deduplication is needed).
df.duplicated().sum()
np.int64(0)
# Per-column count of missing values; all zero after the earlier dropna().
df.isna().sum()
user_id                 0
user_type               0
vehicle_model           0
vehicle_age_years       0
station_id              0
station_location        0
charger_type            0
start_time              0
end_time                0
duration_hours          0
energy_consumed_kwh     0
charging_cost_usd       0
charging_rate_kw        0
soc_start_percent       0
soc_end_percent         0
time_of_day             0
day_of_week             0
distance_driven_km      0
temperature_c           0
battery_capacity_kwh    0
dtype: int64
# Defensive drop of rows missing key fields; a no-op here since the
# isna() check above already reported zero missing values in every column.
df.dropna(subset=['vehicle_model','duration_hours','end_time'], inplace=True)
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Pie chart for the distribution of user types (class-balance check before modeling).
plt.figure(figsize=(8, 5))
pie_plot = df['user_type'].value_counts().plot.pie(
    autopct='%1.1f%%',  # show each slice's share with one decimal place
    startangle=90, 
    colors=sns.color_palette('Set2'),
    fontsize=12
)
plt.title('Distribution of User Types', fontsize=16)
plt.show()
_images/8416040824320bd61e2f1433881a2b3dd915690677f459c0ffba512ce46b3c66.png
%pip install -q dagshub mlflow
Note: you may need to restart the kernel to use updated packages.

Experiment 1

import dagshub
dagshub.init(repo_owner='saisatvikh', repo_name='final_repo', mlflow=True)
Accessing as saisatvikh
Initialized MLflow to track repo "saisatvikh/final_repo"
Repository saisatvikh/final_repo initialized!
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression,RidgeClassifier
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
from sklearn.metrics import classification_report, f1_score
# Build the shared preprocessing + logistic-regression baseline pipeline.
# NOTE(review): X and y must already exist from an earlier cell — they are not
# created here.
categorical_features = X.select_dtypes(include=['object']).columns
for col in categorical_features:
    X[col] = X[col].astype(str)
    
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
# Numeric columns: mean-impute then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), 
    ('scaler', StandardScaler()) 
])

# Categorical columns: mode-impute then one-hot encode (unknown categories ignored).
# NOTE(review): per the fitted-pipeline repr below, this one-hot encodes
# user_id, start_time and end_time too — that explodes dimensionality and may
# act as an identity leak; confirm this is intended.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')) 
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear'))
])
# 70/30 split; preprocessing is fit inside the pipeline, so CV folds don't leak.
# NOTE(review): consider stratify=y for a classification split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
cv_results = cross_val_score(model_pipeline, X_train, y_train, cv=10, scoring='f1_macro')
print(f"Cross-validation F1-score (mean): {cv_results.mean():.4f}")
print(f"Cross-validation F1-score (std): {cv_results.std():.4f}")
Cross-validation F1-score (mean): 0.3656
Cross-validation F1-score (std): 0.0559
model_pipeline.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
       'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
       'soc_end_percent', 'distance_driven_km', 'temperature_c',
       'battery_capacity_kwh'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
       'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
      dtype='object'))])),
                ('classifier', LogisticRegression(solver='liblinear'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Evaluate the fitted logistic-regression pipeline on the held-out 30% split.
y_pred = model_pipeline.predict(X_test)
logistic_f1_score= f1_score(y_test, y_pred,average='macro')  # macro = every class weighted equally
print(f"F1-score on test data: {logistic_f1_score:.4f}")
F1-score on test data: 0.3007
# Per-class confusion-matrix tallies (vectorized: one entry per class).
# Fix: the original assigned the row-sum-minus-diagonal to `tn` (that is the
# false-negative count) and the remainder to `fn` (that is the true-negative
# count) — the two were swapped.  The per-class formulas used inside the
# MLflow cells below are the correct reference.
cm = confusion_matrix(y_test, y_pred)
tp = cm.diagonal()              # correct predictions for each class
fn = cm.sum(axis=1) - tp        # actual class count minus hits (row sum - TP)
fp = cm.sum(axis=0) - tp        # predicted class count minus hits (col sum - TP)
tn = cm.sum() - (tp + fp + fn)  # remainder of all samples
import mlflow
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np

# MLflow experiment: log CV + test metrics and the model for the logistic baseline.
# NOTE(review): cv=3 here but cv=10 in the earlier standalone cell — the logged
# CV numbers are not directly comparable to the printed ones above.
with mlflow.start_run(nested=True):
    # Log model and scaler as parameters
    mlflow.log_param("model", "Logistic Regression")
    mlflow.log_param("scaler", "StandardScaler")
  
    # Cross-validation to compute the mean and std of f1 score
    cv_results = cross_val_score(model_pipeline, X_train, y_train, cv=3, scoring='f1_macro')
    
    mlflow.log_metric("cv_f1_mean", cv_results.mean())
    mlflow.log_metric("cv_f1_std", cv_results.std())
    
    # Train the model
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)

    # Compute F1 score (macro average)
    f1 = f1_score(y_test, y_pred, average='macro')
    mlflow.log_metric("f1_test", f1)

    # Compute confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    # Choose a single class (e.g., class 0)
    class_index = 0  # Change this to select a different class if needed

    # Extract metrics for the selected class
    tp = cm[class_index, class_index]  # True Positive: Diagonal element for the class
    fn = cm[class_index].sum() - tp  # False Negative: Sum of row - TP
    fp = cm[:, class_index].sum() - tp  # False Positive: Sum of column - TP
    tn = cm.sum() - (tp + fn + fp)  # True Negative: Total sum - (TP + FN + FP)

    # Log the metrics for the selected class (e.g., class 0)
    mlflow.log_metric(f"TP_class_{class_index}", tp)
    mlflow.log_metric(f"TN_class_{class_index}", tn)
    mlflow.log_metric(f"FP_class_{class_index}", fp)
    mlflow.log_metric(f"FN_class_{class_index}", fn)

    # Log the model
    mlflow.sklearn.log_model(model_pipeline, "logistic_regression_model")

    # Print the logged metrics for the selected class
    print(f"Logged F1 score to MLFlow: {f1:.4f}")
    print(f"Logged CV results (mean): {cv_results.mean():.4f}")
    print(f"Logged CV results (std): {cv_results.std():.4f}")
    
    # Optionally, print the confusion matrix values for the selected class
    print(f"Confusion Matrix:\n{cm}")
    print(f"True Positives (Class {class_index}): {tp}")
    print(f"True Negatives (Class {class_index}): {tn}")
    print(f"False Positives (Class {class_index}): {fp}")
    print(f"False Negatives (Class {class_index}): {fn}")
2024/12/20 16:11:37 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Logged F1 score to MLFlow: 0.3007
Logged CV results (mean): 0.3572
Logged CV results (std): 0.0186
Confusion Matrix:
[[18 47 45]
 [25 51 32]
 [32 53 37]]
True Positives (Class 0): 18
True Negatives (Class 0): 173
False Positives (Class 0): 57
False Negatives (Class 0): 92
🏃 View run colorful-asp-716 at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0/runs/2473cb10a4e34cb0a266761edf1da0fa
🧪 View experiment at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0

Ridge Classifier

# Second estimator: same preprocessing, ridge-regression-based classifier.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RidgeClassifier())
])
model_pipeline.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
       'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
       'soc_end_percent', 'distance_driven_km', 'temperature_c',
       'battery_capacity_kwh'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
       'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
      dtype='object'))])),
                ('classifier', RidgeClassifier())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Hold-out macro F1 for the ridge pipeline.
y_pred = model_pipeline.predict(X_test)
ridge_f1_score= f1_score(y_test, y_pred,average='macro')
print(f"F1-score on test data: {ridge_f1_score:.4f}")
F1-score on test data: 0.3370
import mlflow
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np

# MLflow experiment: same logging protocol as the logistic run, RidgeClassifier.
with mlflow.start_run(nested=True):
    # Log model and scaler as parameters
    mlflow.log_param("model", "Ridge Classifier")
    mlflow.log_param("scaler", "StandardScaler")
  
    # 3-fold CV on the training split (macro F1).
    cv_results = cross_val_score(model_pipeline, X_train, y_train, cv=3, scoring='f1_macro')
    
    mlflow.log_metric("cv_f1_mean", cv_results.mean())
    mlflow.log_metric("cv_f1_std", cv_results.std())
    
    # Refit on the full training split, then score on the hold-out set.
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)

    f1 = f1_score(y_test, y_pred, average='macro')
    mlflow.log_metric("f1_test", f1)

    cm = confusion_matrix(y_test, y_pred)
    class_index = 0  # per-class breakdown is logged for class 0 only

    tp = cm[class_index, class_index]  # diagonal entry = true positives
    fn = cm[class_index].sum() - tp  # row total minus TP
    fp = cm[:, class_index].sum() - tp  # column total minus TP
    tn = cm.sum() - (tp + fn + fp)  # everything else

    mlflow.log_metric(f"TP_class_{class_index}", tp)
    mlflow.log_metric(f"TN_class_{class_index}", tn)
    mlflow.log_metric(f"FP_class_{class_index}", fp)
    mlflow.log_metric(f"FN_class_{class_index}", fn)

    mlflow.sklearn.log_model(model_pipeline, "ridge_model")

    print(f"Logged F1 score to MLFlow: {f1:.4f}")
    print(f"Logged CV results (mean): {cv_results.mean():.4f}")
    print(f"Logged CV results (std): {cv_results.std():.4f}")
    
    print(f"Confusion Matrix:\n{cm}")
    print(f"True Positives (Class {class_index}): {tp}")
    print(f"True Negatives (Class {class_index}): {tn}")
    print(f"False Positives (Class {class_index}): {fp}")
    print(f"False Negatives (Class {class_index}): {fn}")
2024/12/20 16:11:54 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Logged F1 score to MLFlow: 0.3370
Logged CV results (mean): 0.3621
Logged CV results (std): 0.0105
Confusion Matrix:
[[25 48 37]
 [25 53 30]
 [35 48 39]]
True Positives (Class 0): 25
True Negatives (Class 0): 170
False Positives (Class 0): 60
False Negatives (Class 0): 85
🏃 View run incongruous-stag-353 at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0/runs/fe3cab3a15b940a2856d4f2211e829cd
🧪 View experiment at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0

Random Forest Classifier

# Third estimator: random forest behind the same preprocessing.
# NOTE(review): no random_state is set, so results differ between runs.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])
model_pipeline.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
       'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
       'soc_end_percent', 'distance_driven_km', 'temperature_c',
       'battery_capacity_kwh'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
       'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
      dtype='object'))])),
                ('classifier', RandomForestClassifier())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Hold-out macro F1 for the random-forest pipeline.
y_pred = model_pipeline.predict(X_test)
random_forest_f1= f1_score(y_test, y_pred,average='macro')
print(f"F1-score on test data: {random_forest_f1:.4f}")
F1-score on test data: 0.2968
import mlflow
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np

# MLflow experiment: same logging protocol, RandomForestClassifier.
# NOTE(review): the forest is unseeded, so the logged test F1 (0.3060 below)
# differs from the 0.2968 printed by the earlier cell — set random_state for
# reproducible logging.
with mlflow.start_run(nested=True):
    # Log model and scaler as parameters
    mlflow.log_param("model", "Random Forest Classifier")
    mlflow.log_param("scaler", "StandardScaler")
  
    # Cross-validation to compute the mean and std of f1 score
    cv_results = cross_val_score(model_pipeline, X_train, y_train, cv=3, scoring='f1_macro')
    
    mlflow.log_metric("cv_f1_mean", cv_results.mean())
    mlflow.log_metric("cv_f1_std", cv_results.std())
    
    # Train the model
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)

    # Compute F1 score (macro average)
    f1 = f1_score(y_test, y_pred, average='macro')
    mlflow.log_metric("f1_test", f1)

    # Compute confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    # Choose a single class (e.g., class 0)
    class_index = 0  # Change this to select a different class if needed

    # Extract metrics for the selected class
    tp = cm[class_index, class_index]  # True Positive: Diagonal element for the class
    fn = cm[class_index].sum() - tp  # False Negative: Sum of row - TP
    fp = cm[:, class_index].sum() - tp  # False Positive: Sum of column - TP
    tn = cm.sum() - (tp + fn + fp)  # True Negative: Total sum - (TP + FN + FP)

    # Log the metrics for the selected class (e.g., class 0)
    mlflow.log_metric(f"TP_class_{class_index}", tp)
    mlflow.log_metric(f"TN_class_{class_index}", tn)
    mlflow.log_metric(f"FP_class_{class_index}", fp)
    mlflow.log_metric(f"FN_class_{class_index}", fn)

    # Log the model
    mlflow.sklearn.log_model(model_pipeline, "random_forest_model")

    # Print the logged metrics for the selected class
    print(f"Logged F1 score to MLFlow: {f1:.4f}")
    print(f"Logged CV results (mean): {cv_results.mean():.4f}")
    print(f"Logged CV results (std): {cv_results.std():.4f}")
    
    # Optionally, print the confusion matrix values for the selected class
    print(f"Confusion Matrix:\n{cm}")
    print(f"True Positives (Class {class_index}): {tp}")
    print(f"True Negatives (Class {class_index}): {tn}")
    print(f"False Positives (Class {class_index}): {fp}")
    print(f"False Negatives (Class {class_index}): {fn}")
2024/12/20 16:12:13 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Logged F1 score to MLFlow: 0.3060
Logged CV results (mean): 0.3255
Logged CV results (std): 0.0264
Confusion Matrix:
[[11 70 29]
 [11 78 19]
 [15 77 30]]
True Positives (Class 0): 11
True Negatives (Class 0): 204
False Positives (Class 0): 26
False Negatives (Class 0): 99
🏃 View run powerful-hawk-261 at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0/runs/58fd4a3298504e74b452b11a6b9e28de
🧪 View experiment at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0

Polynomial Features

import mlflow
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

# Numeric columns fed into the polynomial expansion below.
numeric_features = [
    'battery_capacity_kwh', 'charging_cost_usd', 
    'duration_hours', 'soc_end_percent', 'soc_start_percent', 
    'temperature_c', 'vehicle_age_years'
]

def add_polynomial_features(df, numeric_features, degree=2):
    """Return *df* with degree-`degree` polynomial terms of *numeric_features* appended.

    The input frame is not modified; a new DataFrame is returned whose extra
    columns are named by ``PolynomialFeatures.get_feature_names_out`` (e.g.
    ``"a b"``, ``"a^2"``).  Note that with ``include_bias=False`` the plain
    degree-1 terms are still emitted, duplicating the original columns.
    """
    expander = PolynomialFeatures(degree=degree, include_bias=False)
    expanded = expander.fit_transform(df[numeric_features])
    expanded_frame = pd.DataFrame(
        expanded,
        columns=expander.get_feature_names_out(numeric_features),
        index=df.index,
    )
    # Keep every original column and bolt the polynomial terms on the right.
    return pd.concat([df, expanded_frame], axis=1)

# Build the expanded dataset and record the experiment in MLflow.
df_with_poly_features = add_polynomial_features(df, numeric_features, degree=2)

# NOTE(review): no explicit mlflow.start_run() here — these calls attach to the
# active run, or implicitly create one if none is active.
mlflow.log_param("polynomial_degree", 2)
mlflow.log_param("numeric_features", numeric_features)

# Number of columns added by the expansion.
mlflow.log_metric("num_poly_features", len(df_with_poly_features.columns) - len(df.columns))

# Persist the expanded dataset locally and attach it to the run as an artifact.
df_with_poly_features.to_csv("polynomial_features_dataset.csv", index=False)
mlflow.log_artifact("polynomial_features_dataset.csv")

print(df_with_poly_features.head())

print("Feature engineering results logged in MLflow.")
  user_id               user_type vehicle_model  vehicle_age_years  \
0  User_1                Commuter        BMW i3                2.0   
1  User_2           Casual Driver  Hyundai Kona                3.0   
2  User_3                Commuter    Chevy Bolt                2.0   
3  User_4  Long-Distance Traveler  Hyundai Kona                1.0   
4  User_5  Long-Distance Traveler  Hyundai Kona                1.0   

    station_id station_location     charger_type           start_time  \
0  Station_391          Houston  DC Fast Charger  2024-01-01 00:00:00   
1  Station_428    San Francisco          Level 1  2024-01-01 01:00:00   
2  Station_181    San Francisco          Level 2  2024-01-01 02:00:00   
3  Station_327          Houston          Level 1  2024-01-01 03:00:00   
4  Station_108      Los Angeles          Level 1  2024-01-01 04:00:00   

              end_time  duration_hours  ...  soc_end_percent^2  \
0  2024-01-01 00:39:00        0.591363  ...        7416.647931   
1  2024-01-01 03:01:00        3.133652  ...        7168.051082   
2  2024-01-01 04:48:00        2.452653  ...        4888.472921   
3  2024-01-01 06:42:00        1.266431  ...        9925.006656   
4  2024-01-01 05:46:00        2.019765  ...        4063.270258   

   soc_end_percent soc_start_percent  soc_end_percent temperature_c  \
0                        2529.479020                    2406.876668   
1                         856.445674                    1211.633594   
2                         479.257596                    1468.409884   
3                        8280.774408                    3817.236936   
4                        3458.670880                    -499.381507   

   soc_end_percent vehicle_age_years  soc_start_percent^2  \
0                         172.239925           862.689475   
1                         253.993031           102.328957   
2                         139.835230            46.985602   
3                          99.624328          6908.934892   
4                          63.743786          2944.033622   

  soc_start_percent temperature_c soc_start_percent vehicle_age_years  \
0                      820.875427                           58.743152   
1                      144.767153                           30.347333   
2                      143.960415                           13.709209   
3                     3184.852063                           83.120003   
4                     -425.075411                           54.258950   

   temperature_c^2  temperature_c vehicle_age_years  vehicle_age_years^2  
0       781.088080                        55.895906                  4.0  
1       204.805455                        42.933077                  9.0  
2       441.084081                        42.004004                  4.0  
3      1468.139854                        38.316313                  1.0  
4        61.374674                        -7.834199                  1.0  

[5 rows x 56 columns]
Feature engineering results logged in MLflow.

Attribute Combinations

import mlflow
import pandas as pd
import numpy as np

def ensure_numeric_columns(df, columns):
    """Coerce the listed columns of *df* to numeric dtypes.

    Values that cannot be parsed become NaN (``errors='coerce'``).  The frame
    is mutated in place and also returned so calls can be chained.
    """
    for name in columns:
        df[name] = pd.to_numeric(df[name], errors='coerce')
    return df

def add_attribute_combinations(df):
    """Return a copy of *df* enriched with derived ratio/difference features.

    Added columns:
        energy_per_duration          energy_consumed_kwh / duration_hours
        distance_per_duration        distance_driven_km / duration_hours
        charging_cost_per_kwh        charging_cost_usd / energy_consumed_kwh
        soc_diff                     soc_end_percent - soc_start_percent
        temperature_adjusted_energy  energy_consumed_kwh / (1 + |temperature_c|)

    Division artifacts (+/-inf) and any remaining NaNs are zeroed out.
    """
    enriched = df.copy()

    required = [
        'energy_consumed_kwh', 'duration_hours', 'distance_driven_km',
        'charging_cost_usd', 'soc_end_percent', 'soc_start_percent', 'temperature_c'
    ]
    # Force the source columns to numeric dtypes before doing arithmetic.
    enriched = ensure_numeric_columns(enriched, required)

    # Derived features.
    enriched['energy_per_duration'] = enriched['energy_consumed_kwh'] / enriched['duration_hours']
    enriched['distance_per_duration'] = enriched['distance_driven_km'] / enriched['duration_hours']
    enriched['charging_cost_per_kwh'] = enriched['charging_cost_usd'] / enriched['energy_consumed_kwh']
    enriched['soc_diff'] = enriched['soc_end_percent'] - enriched['soc_start_percent']
    enriched['temperature_adjusted_energy'] = enriched['energy_consumed_kwh'] / (1 + np.abs(enriched['temperature_c']))

    # Division by zero yields inf; normalize both inf and NaN to 0.
    enriched.replace([np.inf, -np.inf], np.nan, inplace=True)
    enriched.fillna(0, inplace=True)

    return enriched

# Add the features to the dataset
df_with_features = add_attribute_combinations(df)

# Log parameters for the feature combinations
mlflow.log_param("attribute_combination_features", [
    "energy_per_duration", "distance_per_duration", "charging_cost_per_kwh",
    "soc_diff", "temperature_adjusted_energy"
])

# Log metrics (e.g., number of new features created)
mlflow.log_metric("num_new_features", len(df_with_features.columns) - len(df.columns))

# Log the dataset with new features as an artifact
df_with_features.to_csv("attribute_combined_dataset.csv", index=False)
mlflow.log_artifact("attribute_combined_dataset.csv")

# Display the updated DataFrame
print(df_with_features.head())
# Close the implicitly-started run so later cells begin with a fresh one.
mlflow.end_run()

print("Feature combinations logged in MLflow.")
  user_id               user_type vehicle_model  vehicle_age_years  \
0  User_1                Commuter        BMW i3                2.0   
1  User_2           Casual Driver  Hyundai Kona                3.0   
2  User_3                Commuter    Chevy Bolt                2.0   
3  User_4  Long-Distance Traveler  Hyundai Kona                1.0   
4  User_5  Long-Distance Traveler  Hyundai Kona                1.0   

    station_id station_location     charger_type           start_time  \
0  Station_391          Houston  DC Fast Charger  2024-01-01 00:00:00   
1  Station_428    San Francisco          Level 1  2024-01-01 01:00:00   
2  Station_181    San Francisco          Level 2  2024-01-01 02:00:00   
3  Station_327          Houston          Level 1  2024-01-01 03:00:00   
4  Station_108      Los Angeles          Level 1  2024-01-01 04:00:00   

              end_time  duration_hours  ...  day_of_week  distance_driven_km  \
0  2024-01-01 00:39:00        0.591363  ...      Tuesday          293.602111   
1  2024-01-01 03:01:00        3.133652  ...       Monday          112.112804   
2  2024-01-01 04:48:00        2.452653  ...     Thursday           71.799253   
3  2024-01-01 06:42:00        1.266431  ...     Saturday          199.577785   
4  2024-01-01 05:46:00        2.019765  ...     Saturday          203.661847   

   temperature_c  battery_capacity_kwh  charger_type_encoded  \
0      27.947953            108.463007                     0   
1      14.311026            100.000000                     1   
2      21.002002             75.000000                     2   
3      38.316313             50.000000                     1   
4      -7.834199             50.000000                     1   

  energy_per_duration distance_per_duration  charging_cost_per_kwh   soc_diff  \
0          102.665033            496.483377               0.215569  56.748386   
1            3.937666             35.777043               1.712292  74.548566   
2            7.799260             29.274121               1.864577  63.063011   
3           62.741544            157.590753               0.164065  16.504325   
4            9.718509            100.834423               0.517674   9.484836   

   temperature_adjusted_energy  
0                     2.097293  
1                     0.805908  
2                     0.869415  
3                     2.020989  
4                     2.221945  

[5 rows x 26 columns]
🏃 View run bald-smelt-249 at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0/runs/f05524583bbc4ff599e401b6231dcf6c
🧪 View experiment at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0
Feature combinations logged in MLflow.

Variance Threshold, Correlation Threshold, Feature Importance

import pandas as pd
import numpy as np
import mlflow
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Feature-selection experiments: rebuild X/y and a dense preprocessor.
target = 'user_type'

# Start MLflow run
mlflow.start_run(run_name="Feature_Selection_Experiment")

# Split dataset into features and target
X = df.drop(columns=[target])
y = df[target]

# Log parameters
mlflow.log_param("target_column", target)

# Identify categorical and numerical columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_features = X.select_dtypes(exclude=['object', 'category']).columns.tolist()

# Log categorical and numerical features
mlflow.log_param("categorical_features", categorical_features)
mlflow.log_param("numeric_features", numeric_features)

# Preprocessing pipeline for categorical and numerical features
numeric_transformer = StandardScaler()

# Convert categorical columns to string to avoid mixed types
# NOTE(review): DataFrame.map is pandas >= 2.1; the PCA cell below uses the
# older applymap for the same job — the two should agree.
X[categorical_features] = X[categorical_features].map(str)

# Categorical transformer for one-hot encoding with dense output
categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)

# Combine the transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
# Experiment 1: Feature selection using Correlation Threshold
def correlation_threshold(X, threshold=0.9):
    """Drop features whose absolute pairwise correlation exceeds *threshold*.

    Accepts a DataFrame, a dense ndarray, or anything exposing ``toarray()``
    (a sparse matrix); arrays are wrapped in a DataFrame first.  For each
    highly-correlated pair only the upper-triangle column is removed, so one
    representative of every correlated group survives.
    """
    # Normalize the input to a DataFrame.
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X)
    elif hasattr(X, 'toarray'):  # sparse matrix
        X = pd.DataFrame(X.toarray())

    abs_corr = X.corr().abs()
    # Mask the diagonal and lower triangle so every pair is examined once.
    pair_mask = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(pair_mask)

    doomed = [col for col in upper.columns if (upper[col] > threshold).any()]
    return X.drop(columns=doomed)
# Apply the preprocessor and correlation threshold
X_processed = preprocessor.fit_transform(X)
X_corr_selected = correlation_threshold(pd.DataFrame(X_processed), threshold=0.9)

# Log Correlation Threshold results
mlflow.log_param("correlation_threshold", 0.9)
mlflow.log_metric("num_features_after_correlation_threshold", X_corr_selected.shape[1])

# Experiment 2: Feature selection using Feature Importance (Random Forest)
def feature_importance(X, y):
    """Reorder the columns of *X* by random-forest importance, most important first.

    Fits a 100-tree RandomForestClassifier (seed 42 for repeatability) on
    (X, y) and returns X with its columns permuted in descending order of
    ``feature_importances_``.  No columns are dropped — this ranks rather
    than selects.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X, y)
    # argsort ascends, so reverse for most-important-first ordering.
    ranking = np.argsort(forest.feature_importances_)[::-1]
    return X.iloc[:, ranking]
# Apply the preprocessor and feature importance
# NOTE(review): feature_importance() only reorders columns — nothing is
# dropped, so this "selection" keeps every feature and the count logged
# below equals the full feature count.
X_fi_processed = preprocessor.fit_transform(X)
X_fi_selected = feature_importance(pd.DataFrame(X_fi_processed), y)

# Log Feature Importance results
mlflow.log_param("feature_importance_model", "Random Forest")
mlflow.log_metric("num_features_after_feature_importance", X_fi_selected.shape[1])

# Experiment 3: Feature selection using Variance Threshold
def variance_threshold(X, threshold=0.01):
    """Remove features whose variance is at or below *threshold*.

    Thin wrapper around sklearn's VarianceThreshold.  Note the return value
    is a NumPy array (``fit_transform`` output), not a DataFrame.
    """
    return VarianceThreshold(threshold=threshold).fit_transform(X)

# Apply the preprocessor and variance threshold
X_var_processed = preprocessor.fit_transform(X)
X_var_selected = variance_threshold(pd.DataFrame(X_var_processed), threshold=0.01)

# Log Variance Threshold results
mlflow.log_param("variance_threshold", 0.01)
mlflow.log_metric("num_features_after_variance_threshold", X_var_selected.shape[1])

# Train and evaluate models to validate the selected features
def evaluate_model(X_selected, y):
    """Score a feature subset: accuracy of a seeded random forest on a 20% hold-out.

    Uses the same estimator settings everywhere (100 trees, random_state=42)
    so accuracies are comparable across the selection experiments.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y, test_size=0.2, random_state=42
    )
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    return accuracy_score(y_test, clf.predict(X_test))
# Evaluate models after each feature selection method
# NOTE(review): X_fi_selected keeps all features (see note above), so
# accuracy_fi is effectively the all-features baseline.
accuracy_corr = evaluate_model(X_corr_selected, y)
accuracy_fi = evaluate_model(X_fi_selected, y)
accuracy_var = evaluate_model(X_var_selected, y)

# Log the model performance
mlflow.log_metric("accuracy_after_correlation_threshold", accuracy_corr)
mlflow.log_metric("accuracy_after_feature_importance", accuracy_fi)
mlflow.log_metric("accuracy_after_variance_threshold", accuracy_var)

# End the MLflow run
mlflow.end_run()

# Print summary
print(f"Accuracy after Correlation Threshold: {accuracy_corr:.4f}")
print(f"Accuracy after Feature Importance: {accuracy_fi:.4f}")
print(f"Accuracy after Variance Threshold: {accuracy_var:.4f}")

Principal Component Analysis

import pandas as pd
import numpy as np
import mlflow
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Assuming df is your dataset and 'user_type' is the target column
target = 'user_type'

# Start MLflow run
mlflow.start_run(run_name="PCA_Dimensionality_Reduction")

# Split dataset into features and target
X = df.drop(columns=[target])
y = df[target]

# Log parameters
mlflow.log_param("target_column", target)

# Identify categorical and numerical columns by dtype
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_features = X.select_dtypes(exclude=['object', 'category']).columns.tolist()

# Log categorical and numerical features
mlflow.log_param("categorical_features", categorical_features)
mlflow.log_param("numeric_features", numeric_features)

# Preprocessing pipeline for categorical and numerical features
numeric_transformer = StandardScaler()

# Convert categorical columns to string to avoid mixed types.
# FIX: DataFrame.applymap is deprecated (it raised a FutureWarning when this
# cell ran); astype(str) performs the same element-wise string conversion.
X[categorical_features] = X[categorical_features].astype(str)

# Categorical transformer for one-hot encoding with sparse_output=False (dense output)
categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)

# Combine the transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Apply preprocessing to the features (scaled numerics + dense one-hot categoricals)
X_processed = preprocessor.fit_transform(X)

# Apply PCA for dimensionality reduction.
# All components are kept at first; the final count is chosen below from the
# cumulative explained variance.
pca = PCA()
X_pca = pca.fit_transform(X_processed)

# Log the explained variance ratio of every component.
explained_variance_ratio = pca.explained_variance_ratio_
mlflow.log_param("explained_variance_ratio", explained_variance_ratio.tolist())

# Create a scree plot to visualize the explained variance per component.
plt.figure(figsize=(8, 6))
plt.plot(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio, marker='o', linestyle='--')
plt.title("Scree Plot: Explained Variance Ratio per Principal Component")
plt.xlabel("Principal Components")
plt.ylabel("Explained Variance Ratio")
plt.grid(True)
plt.tight_layout()

# Save the plot to a file (relative to the notebook's working directory).
scree_plot_path = "scree_plot.png"
plt.savefig(scree_plot_path)

# Show the plot in the output
plt.show()

# Log the scree plot in MLFlow as an artifact.
mlflow.log_artifact(scree_plot_path)

# Determine how many components to keep based on cumulative explained variance.
cumulative_variance = np.cumsum(explained_variance_ratio)
mlflow.log_param("cumulative_variance", cumulative_variance.tolist())

# Log the number of components based on the desired explained variance threshold.
threshold = 0.95  # Choose the threshold for explained variance
# argmax returns the first index where the cumulative variance crosses the
# threshold; +1 converts the 0-based index to a component count.
num_components = np.argmax(cumulative_variance >= threshold) + 1
mlflow.log_param("num_components_selected", num_components)

# Apply PCA again with only the selected number of components.
pca_selected = PCA(n_components=num_components)
X_pca_selected = pca_selected.fit_transform(X_processed)

# Log feature counts before/after the reduction.
mlflow.log_param("num_features_before_pca", X_processed.shape[1])
mlflow.log_param("num_features_after_pca", X_pca_selected.shape[1])

# Train and evaluate model on PCA-reduced features
def evaluate_model(X_selected, y):
    """Return hold-out accuracy of a 100-tree RandomForest on *X_selected*.

    Uses a fixed 80/20 split (random_state=42) so results are reproducible.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        X_selected, y, test_size=0.2, random_state=42
    )
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_tr, y_tr)
    preds = clf.predict(X_te)
    return accuracy_score(y_te, preds)

# Score a RandomForest on the PCA-reduced feature matrix.
accuracy_pca = evaluate_model(X_pca_selected, y)

# Log the model performance
mlflow.log_metric("accuracy_after_pca", accuracy_pca)

# End the MLflow run
mlflow.end_run()

# Print the results: hold-out accuracy and the chosen component count.
print(f"Accuracy after PCA: {accuracy_pca:.4f}")
print(f"Number of components selected: {num_components}")
/var/folders/5q/38fn8x6x05j5m61tvnrj7t040000gn/T/ipykernel_5137/4008606804.py:38: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
  X[categorical_features] = X[categorical_features].applymap(str)
_images/1d4b51a565989954762c75ed2134af84ab12ce216cdc87ef51bc702e976813d9.png
🏃 View run PCA_Dimensionality_Reduction at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0/runs/197ae3b24a484273951a835c661a6069
🧪 View experiment at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0
Accuracy after PCA: 0.2952
Number of components selected: 788

Custom Experiment 2

import mlflow
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np

with mlflow.start_run(nested=True):
    # Record the configuration of this nested run.
    mlflow.log_param("model", "SVM Classifier")
    mlflow.log_param("scaler", "StandardScaler")

    # 3-fold cross-validated macro-F1 on the training split.
    # NOTE(review): model_pipeline / X_train / y_train come from an earlier cell.
    cv_results = cross_val_score(model_pipeline, X_train, y_train, cv=3, scoring='f1_macro')

    mlflow.log_metric("cv_f1_mean", cv_results.mean())
    mlflow.log_metric("cv_f1_std", cv_results.std())

    # Fit on the full training split and score the held-out test set.
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)

    # BUG FIX: the original logged and printed an undefined name `f1`
    # (NameError at runtime); the computed score lives in `svm_f1_score`.
    svm_f1_score = f1_score(y_test, y_pred, average='macro')
    mlflow.log_metric("f1_test", svm_f1_score)

    cm = confusion_matrix(y_test, y_pred)

    # Derive TP/TN/FP/FN for a single class from the multi-class confusion
    # matrix (one-vs-rest view of class `class_index`).
    class_index = 0

    tp = cm[class_index, class_index]        # correctly predicted as this class
    fn = cm[class_index].sum() - tp          # this class predicted as something else
    fp = cm[:, class_index].sum() - tp       # other classes predicted as this class
    tn = cm.sum() - (tp + fn + fp)           # everything else

    mlflow.log_metric(f"TP_class_{class_index}", tp)
    mlflow.log_metric(f"TN_class_{class_index}", tn)
    mlflow.log_metric(f"FP_class_{class_index}", fp)
    mlflow.log_metric(f"FN_class_{class_index}", fn)

    mlflow.sklearn.log_model(model_pipeline, "svm_model")

    print(f"Logged F1 score to MLFlow: {svm_f1_score:.4f}")
    print(f"Logged CV results (mean): {cv_results.mean():.4f}")
    print(f"Logged CV results (std): {cv_results.std():.4f}")

    print(f"Confusion Matrix:\n{cm}")
    print(f"True Positives (Class {class_index}): {tp}")
    print(f"True Negatives (Class {class_index}): {tn}")
    print(f"False Positives (Class {class_index}): {fp}")
    print(f"False Negatives (Class {class_index}): {fn}")

Custom Experiment 2

import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Median-impute any remaining missing values in the numeric columns.
numerical_columns = df.select_dtypes(include=["float64", "int64"]).columns
imputer = SimpleImputer(strategy="median")
df[numerical_columns] = imputer.fit_transform(df[numerical_columns])

# Standardize the numeric features — t-SNE and KNN are distance-based.
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df[numerical_columns]), columns=numerical_columns)

# Integer-encode charger_type to drive the scatter colouring below.
label_encoder = LabelEncoder()
df['charger_type_encoded'] = label_encoder.fit_transform(df['charger_type'])

# Project the scaled numeric features to 2-D with t-SNE.
tsne = TSNE(n_components=2, random_state=42)
df_tsne = tsne.fit_transform(df_scaled)

plt.figure(figsize=(8, 6))
plt.scatter(df_tsne[:, 0], df_tsne[:, 1], c=df['charger_type_encoded'], cmap='viridis', s=50, alpha=0.7)
plt.title('t-SNE visualization of the data')
# BUG FIX: the points are coloured by charger_type_encoded, so label the
# colourbar accordingly (the original label said 'user_type').
plt.colorbar(label='charger_type')
plt.show()

# Features and target for the KNN classifier below.
X = df_scaled
y = df['user_type']

# 80/20 train/validation split with a fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyper-parameter grid for KNN: neighbour counts and distance metrics.
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'metric': ['euclidean', 'manhattan'],
}

# 5-fold grid search over accuracy, parallelised across all cores.
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=5, n_jobs=-1, scoring='accuracy')

with mlflow.start_run():
    # Log the search configuration.
    mlflow.log_param("cv_folds", 5)
    mlflow.log_param("param_grid", param_grid)

    # Run the grid search on the training split and keep the best estimator.
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    best_knn = grid_search.best_estimator_

    mlflow.log_params(best_params)

    # Evaluate the tuned model on the held-out validation split.
    y_pred = best_knn.predict(X_val)

    cm = confusion_matrix(y_val, y_pred)
    report = classification_report(y_val, y_pred, output_dict=True)

    # Log headline metrics pulled from the classification report dict.
    mlflow.log_metric("accuracy", report["accuracy"])
    mlflow.log_metric("macro_avg_f1", report["macro avg"]["f1-score"])
    mlflow.log_metric("weighted_avg_f1", report["weighted avg"]["f1-score"])
    mlflow.log_metric("precision", report["macro avg"]["precision"])
    mlflow.log_metric("recall", report["macro avg"]["recall"])

    # Render the confusion matrix as a heatmap and attach it as an artifact.
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=np.unique(y), yticklabels=np.unique(y))
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.title("Confusion Matrix")
    plt.savefig("confusion_matrix.png")
    mlflow.log_artifact("confusion_matrix.png")

    mlflow.sklearn.log_model(best_knn, "knn_model")

    # Console summary for the notebook output.
    print(f"Best Parameters: {best_params}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"Classification Report:\n{classification_report(y_val, y_pred)}")

F1-Score Comparison

from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns

# Compare the four models' macro-F1 scores in one horizontal bar chart.
# NOTE(review): the *_f1_score variables come from the earlier experiment cells.
model_names = ['Logistic Regression','Random Forest','Ridge Classifier','SVM']
f1_scores = [logistic_f1_score,random_forest_f1,ridge_f1_score,svm_f1_score]


f1_scores_df = pd.DataFrame({
    'Model': model_names,
    'F1-Score': f1_scores
})

# Sort so the best-performing model is plotted at the top.
f1_df = f1_scores_df.sort_values(by='F1-Score', ascending=False)

plt.figure(figsize=(10, 6))
# FIX: seaborn deprecates passing `palette` without `hue` (the FutureWarning
# seen in this cell's output). Assigning hue='Model' with legend=False keeps
# the identical rendering without the warning.
sns.barplot(x='F1-Score', y='Model', data=f1_df, hue='Model', palette='viridis', legend=False)

plt.title('Model Comparison Based on F1-Score', fontsize=16)
plt.xlabel('F1-Score', fontsize=14)
plt.ylabel('Model', fontsize=14)

plt.show()
/var/folders/5q/38fn8x6x05j5m61tvnrj7t040000gn/T/ipykernel_5137/261019361.py:17: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='F1-Score', y='Model', data=f1_df, palette='viridis')
_images/00a2660654d596a95992cc23f348b060cc6f8700da33637c7ae979d4a4a60800.png